# ██████╗ ██╗ ██╗██╗ ██╗██╗ ██████╗ ██████╗ ██╗ ██╗███████╗██████╗ ███████╗
# ██╔══██╗██║ ██║╚██╗ ██╔╝██║ ██╔═══██╗██╔══██╗██║ ██║██╔════╝██╔══██╗██╔════╝
# ██████╔╝███████║ ╚████╔╝ ██║ ██║ ██║██████╔╝███████║█████╗ ██████╔╝█████╗
# ██╔═══╝ ██╔══██║ ╚██╔╝ ██║ ██║ ██║██╔═══╝ ██╔══██║██╔══╝ ██╔══██╗██╔══╝
# ██║ ██║ ██║ ██║ ███████╗╚██████╔╝██║ ██║ ██║███████╗██║ ██║███████╗
# ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚══════╝ ╚═════╝ ╚═╝ ╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝╚══════╝
## PHYLOPHERE: A Nextflow pipeline including a complete set
## of phylogenetic comparative tools and analyses for Phenome-Genome studies
### Github: https://github.com/nozerorma/caastools/nf-phylophere
### Author: Miguel Ramon (miguel.ramon@upf.edu)
#### File: CI-composition.Rmd
This script computes 95% credibility intervals (CIs) for the trait of interest:
This section configures the working environment, sets directories, and loads necessary functions and libraries.
# Call the setup function from commons.R
source("./obj/commons.R")
## [DEBUG] args = cancer_traits_processed-LQ.csv | science.abn7829_data_s4.nex.tree | data_exploration | 1998 | primates | family | malignant_prevalence | adult_necropsy_count | malignant_count | /home/miguel/IBE-UPF/PhD/PhyloPhere/Data/5.Phylogeny/taxid_species_family.tsv | neoplasia_prevalence | LQ
## [DEBUG] seed_val = 1998
## [DEBUG] workingDir = /home/miguel/IBE-UPF/PhD/PhyloPhere/work/e6/30761e30fd9009c710237054109a64
## [DEBUG] objDir = /home/miguel/IBE-UPF/PhD/PhyloPhere/work/e6/30761e30fd9009c710237054109a64/obj
## [DEBUG] resultsDir = data_exploration
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## [1] "Working Directory: /home/miguel/IBE-UPF/PhD/PhyloPhere/work/e6/30761e30fd9009c710237054109a64"
## [1] "Results Directory: data_exploration"
## [DEBUG] Working Directory: /home/miguel/IBE-UPF/PhD/PhyloPhere/work/e6/30761e30fd9009c710237054109a64
## [DEBUG] Results Directory: data_exploration
## [DEBUG] data_exploration_dir = data_exploration/1.Data-exploration
## [DEBUG] species_distribution_dir = data_exploration/1.Data-exploration/1.Species_distribution
## [DEBUG] extreme_plots_dir = data_exploration/1.Data-exploration/2.Extreme_plots
## [DEBUG] asr_trees = data_exploration/1.Data-exploration/3.ASR_trees
## [DEBUG] phylo_distribution_dir = data_exploration/1.Data-exploration/4.Phylogenetic_distribution
## [DEBUG] ci_dir = data_exploration/1.Data-exploration/5.CI_overlaps
## [DEBUG] palettes loaded: primates=15, primates_dark=15, mammals=17
## [1] "Loading trait data from: cancer_traits_processed-LQ.csv"
## [DEBUG] trait_path = cancer_traits_processed-LQ.csv
## [DEBUG] trait_df rows = 47, cols = 19
## [DEBUG] trait_df columns: tax_id, species, common_name, family, adult_necropsy_count, neoplasia_necropsy, neoplasia_prevalence, benign_count, benign_prevalence, malignant_count, malignant_prevalence, body_mass, mass_g, log_body_mass, mature_f, mature_m, mature_age, MLS.anage, LQ
## [DEBUG] trait_df species unique = 47
## [DEBUG] tree_path = science.abn7829_data_s4.nex.tree
## [DEBUG] tree tips = 236, nodes = 235
## [DEBUG] clade_name = primates
## [DEBUG] taxon_of_interest = family
## [DEBUG] trait = malignant_prevalence
## [DEBUG] n_trait = malignant_count
## [DEBUG] p_trait = adult_necropsy_count
## [DEBUG] has.p = TRUE, p missing = 0
## [DEBUG] has.n = TRUE, n missing = 0
## Warning: package 'ape' was built under R version 4.4.2
##
## Attaching package: 'ape'
## The following object is masked from 'package:dplyr':
##
## where
## [DEBUG] tax_id_file = /home/miguel/IBE-UPF/PhD/PhyloPhere/Data/5.Phylogeny/taxid_species_family.tsv
## [DEBUG] tax_id_df rows = 1290, distinct taxa = 807
## [DEBUG] has.TAX_ID = TRUE
## [DEBUG] trait_df merged with tax_id_df: rows = 47, missing tax_id = 0
## [DEBUG] normalized tax_id from merged columns, missing tax_id = 0
## [DEBUG] tree_ids rows = 40, missing tax_id = 0
## [DEBUG] common_tax_ids = 40
## [DEBUG] pruned_tree tips (TAX_ID) = 40, nodes = 39
## [DEBUG] trait_df after TAX_ID tree filter rows = 40
## [DEBUG] secondary_trait = neoplasia_prevalence
## [DEBUG] has.secondary = TRUE, missing = 0
## [DEBUG] branch_trait = LQ
## [DEBUG] has.branch = TRUE, missing = 0
setup_rmd()
# Debug helper (prints into HTML output)
if (is.null(getOption("phylo_debug"))) {
options(phylo_debug = TRUE)
}
if (!exists("phylo_debug_log", envir = .GlobalEnv)) {
phylo_debug_log <- character()
}
if (!exists("debug_log", inherits = TRUE)) {
debug_log <- function(...) {
msg <- sprintf(...)
phylo_debug_log <<- c(phylo_debug_log, msg)
cat("[DEBUG] ", msg, "\n", sep = "")
}
}
# Load additional libraries
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(patchwork)
library(ggtext)
color_palette <- paste0(clade_name, "_palette") # Color palette for the clade
capitalized_taxon <- tools::toTitleCase(taxon_of_interest) # Capitalized taxon name
capitalized_trait <- tools::toTitleCase(gsub("_", " ", trait)) # Capitalized and deconvoluted trait name
capitalized_clade <- tools::toTitleCase(gsub("_", " ", clade_name)) # Capitalized and deconvoluted clade name
createDir(ci_dir)
## [DEBUG] createDir: created data_exploration/1.Data-exploration/5.CI_overlaps
This section calculates 95% confidence intervals for malignant and neoplasia prevalence per species using both Wilson and Jeffreys methods.
# Compute CIs directly using DescTools and binom packages
trait_ci <- trait_df %>%
group_by(species) %>%
dplyr::rename(n_population = .data[[p_trait]],
n_observation = .data[[n_trait]],
trait = .data[[trait]]) %>%
dplyr::mutate(
# Jeffreys method CIs
trait_ci_lb = binom::binom.confint(n_observation, n_population,
method = "bayes", priors = c(0.5, 0.5),
conf.level = 0.95)[, 5],
trait_ci_ub = binom::binom.confint(n_observation, n_population,
method = "bayes", priors = c(0.5, 0.5),
conf.level = 0.95)[, 6],
trait_ci = paste0("[", round(trait_ci_lb, 2), "-", round(trait_ci_ub, 2), "]")
) %>%
ungroup()
## Warning: Use of .data in tidyselect expressions was deprecated in tidyselect 1.2.0.
## ℹ Please use `all_of(var)` (or `any_of(var)`) instead of `.data[[var]]`
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Save results
write.csv(trait_ci, file.path(ci_dir, "trait_CI.csv"),
quote = FALSE, row.names = FALSE)
# Display preview
head(trait_ci)
## # A tibble: 6 × 22
## species common_name family n_population neoplasia_necropsy
## <chr> <chr> <chr> <int> <int>
## 1 Alouatta_caraya Black howler Ateli… 32 4
## 2 Ateles_geoffroyi Black-handed spid… Ateli… 43 13
## 3 Callimico_goeldii Goeldi's monkey Cebid… 69 15
## 4 Callithrix_geoffroyi White-fronted mar… Cebid… 31 3
## 5 Callithrix_jacchus Common marmoset Cebid… 41 1
## 6 Cebuella_pygmaea Pygmy marmoset Cebid… 41 4
## # ℹ 17 more variables: neoplasia_prevalence <dbl>, benign_count <int>,
## # benign_prevalence <dbl>, n_observation <int>, trait <dbl>, body_mass <dbl>,
## # mass_g <dbl>, log_body_mass <dbl>, mature_f <dbl>, mature_m <dbl>,
## # mature_age <dbl>, MLS.anage <dbl>, LQ <dbl>, tax_id <int>,
## # trait_ci_lb <dbl>, trait_ci_ub <dbl>, trait_ci <chr>
This section prepares data for comparing cancer prevalence between all pairs of species.
# Helper function to create pairwise comparisons for a given method
create_pairwise_data <- function(ci_data) {
# Define column names for lower and upper bounds
ci_subset <- ci_data %>%
dplyr::select(species, n_population,
n_observation, trait,
trait_ci_lb, trait_ci_ub)
# Generate all pairwise species combinations
pairwise <- expand.grid(species1 = ci_subset$species,
species2 = ci_subset$species,
stringsAsFactors = FALSE)
# Merge species data for each pair
pairwise <- pairwise %>%
left_join(ci_subset %>% rename_with(~paste0(.x, "1"), .cols = -species),
by = c("species1" = "species")) %>%
left_join(ci_subset %>% rename_with(~paste0(.x, "2"), .cols = -species),
by = c("species2" = "species"))
pairwise <- pairwise %>%
mutate(
trait_diff = trait1 - trait2,
trait_overlap = ci_overlap(
trait_ci_lb1, trait_ci_ub1, trait_ci_lb2, trait_ci_ub2
),
diff_value = trait_diff,
overlap_use = trait_overlap
)
return(pairwise)
}
# Create pairwise data
pairwise_data <- create_pairwise_data(trait_ci)
# Save results
write.csv(pairwise_data, file.path(ci_dir, "pairwise_data.csv"),
quote = FALSE, row.names = FALSE)
# Display preview
head(pairwise_data)
## species1 species2 n_population1 n_observation1 trait1
## 1 Alouatta_caraya Alouatta_caraya 32 1 0.03125000
## 2 Ateles_geoffroyi Alouatta_caraya 43 8 0.18604651
## 3 Callimico_goeldii Alouatta_caraya 69 7 0.10144928
## 4 Callithrix_geoffroyi Alouatta_caraya 31 1 0.03225806
## 5 Callithrix_jacchus Alouatta_caraya 41 1 0.02439024
## 6 Cebuella_pygmaea Alouatta_caraya 41 2 0.04878049
## trait_ci_lb1 trait_ci_ub1 n_population2 n_observation2 trait2 trait_ci_lb2
## 1 6.362256e-05 0.11585204 32 1 0.03125 6.362256e-05
## 2 8.440834e-02 0.30990210 32 1 0.03125 6.362256e-05
## 3 4.082088e-02 0.18007430 32 1 0.03125 6.362256e-05
## 4 6.621844e-05 0.11938682 32 1 0.03125 6.362256e-05
## 5 4.695278e-05 0.09147094 32 1 0.03125 6.362256e-05
## 6 4.027234e-03 0.13016665 32 1 0.03125 6.362256e-05
## trait_ci_ub2 trait_diff trait_overlap diff_value overlap_use
## 1 0.115852 0.000000000 TRUE 0.000000000 TRUE
## 2 0.115852 0.154796512 TRUE 0.154796512 TRUE
## 3 0.115852 0.070199275 TRUE 0.070199275 TRUE
## 4 0.115852 0.001008065 TRUE 0.001008065 TRUE
## 5 0.115852 -0.006859756 TRUE -0.006859756 TRUE
## 6 0.115852 0.017530488 TRUE 0.017530488 TRUE
This section transforms pairwise data into matrix format for visualization.
# Helper function to build difference matrix
build_diff_matrix <- function(pairwise_data) {
pairwise_data %>%
select(species1, species2, diff_value) %>%
pivot_wider(names_from = species2, values_from = diff_value) %>%
column_to_rownames("species1") %>%
as.matrix()
}
# Create matrices
matrix <- build_diff_matrix(pairwise_data)
# Save matrices
write.csv(matrix, file.path(ci_dir, "diff_matrix-CI.csv"),
quote = FALSE, row.names = TRUE)
# Display preview
head(matrix)
## Alouatta_caraya Ateles_geoffroyi Callimico_goeldii
## Alouatta_caraya 0.000000000 -0.15479651 -0.07019928
## Ateles_geoffroyi 0.154796512 0.00000000 0.08459724
## Callimico_goeldii 0.070199275 -0.08459724 0.00000000
## Callithrix_geoffroyi 0.001008065 -0.15378845 -0.06919121
## Callithrix_jacchus -0.006859756 -0.16165627 -0.07705903
## Cebuella_pygmaea 0.017530488 -0.13726602 -0.05266879
## Callithrix_geoffroyi Callithrix_jacchus Cebuella_pygmaea
## Alouatta_caraya -0.001008065 0.006859756 -0.01753049
## Ateles_geoffroyi 0.153788447 0.161656268 0.13726602
## Callimico_goeldii 0.069191211 0.077059031 0.05266879
## Callithrix_geoffroyi 0.000000000 0.007867821 -0.01652242
## Callithrix_jacchus -0.007867821 0.000000000 -0.02439024
## Cebuella_pygmaea 0.016522423 0.024390244 0.00000000
## Cercopithecus_neglectus Colobus_guereza Eulemur_macaco
## Alouatta_caraya -0.004464286 -0.009987113 -0.16875000
## Ateles_geoffroyi 0.150332226 0.144809398 -0.01395349
## Callimico_goeldii 0.065734990 0.060212162 -0.09855072
## Callithrix_geoffroyi -0.003456221 -0.008979049 -0.16774194
## Callithrix_jacchus -0.011324042 -0.016846869 -0.17560976
## Cebuella_pygmaea 0.013066202 0.007543374 -0.15121951
## Galago_moholi Galago_senegalensis Gorilla_gorilla
## Alouatta_caraya -0.03125000 0.016098485 0.001399254
## Ateles_geoffroyi 0.12354651 0.170894996 0.156195765
## Callimico_goeldii 0.03894928 0.086297760 0.071598529
## Callithrix_geoffroyi -0.03024194 0.017106549 0.002407318
## Callithrix_jacchus -0.03810976 0.009238729 -0.005460502
## Cebuella_pygmaea -0.01371951 0.033628973 0.018929742
## Hylobates_lar Lemur_catta Leontopithecus_chrysomelas
## Alouatta_caraya -0.12784091 -0.09989754 -0.08639706
## Ateles_geoffroyi 0.02695560 0.05489897 0.06839945
## Callimico_goeldii -0.05764163 -0.02969827 -0.01619778
## Callithrix_geoffroyi -0.12683284 -0.09888948 -0.08538899
## Callithrix_jacchus -0.13470067 -0.10675730 -0.09325681
## Cebuella_pygmaea -0.11031042 -0.08236705 -0.06886657
## Leontopithecus_rosalia Macaca_fascicularis Macaca_fuscata
## Alouatta_caraya -0.024305556 0.03125000 0.007994186
## Ateles_geoffroyi 0.130490956 0.18604651 0.162790698
## Callimico_goeldii 0.045893720 0.10144928 0.078193461
## Callithrix_geoffroyi -0.023297491 0.03225806 0.009002251
## Callithrix_jacchus -0.031165312 0.02439024 0.001134430
## Cebuella_pygmaea -0.006775068 0.04878049 0.025524674
## Macaca_nigra Macaca_silenus Mandrillus_sphinx
## Alouatta_caraya 0.003472222 0.001838235 -0.013194444
## Ateles_geoffroyi 0.158268734 0.156634747 0.141602067
## Callimico_goeldii 0.073671498 0.072037511 0.057004831
## Callithrix_geoffroyi 0.004480287 0.002846300 -0.012186380
## Callithrix_jacchus -0.003387534 -0.005021521 -0.020054201
## Cebuella_pygmaea 0.021002710 0.019368723 0.004336043
## Mico_argentatus Microcebus_murinus Nycticebus_coucang
## Alouatta_caraya 0.03125000 -0.21436404 -0.05208333
## Ateles_geoffroyi 0.18604651 -0.05956752 0.10271318
## Callimico_goeldii 0.10144928 -0.14416476 0.01811594
## Callithrix_geoffroyi 0.03225806 -0.21335597 -0.05107527
## Callithrix_jacchus 0.02439024 -0.22122379 -0.05894309
## Cebuella_pygmaea 0.04878049 -0.19683355 -0.03455285
## Nycticebus_pygmaeus Pan_troglodytes Papio_hamadryas
## Alouatta_caraya -0.19323980 0.02072368 0.02154126
## Ateles_geoffroyi -0.03844328 0.17552020 0.17633777
## Callimico_goeldii -0.12304052 0.09092296 0.09174054
## Callithrix_geoffroyi -0.19223173 0.02173175 0.02254933
## Callithrix_jacchus -0.20009955 0.01386393 0.01468151
## Cebuella_pygmaea -0.17570931 0.03825417 0.03907175
## Propithecus_coquereli Saguinus_bicolor Saguinus_imperator
## Alouatta_caraya -0.04875000 -0.008750000 0.03125000
## Ateles_geoffroyi 0.10604651 0.146046512 0.18604651
## Callimico_goeldii 0.02144928 0.061449275 0.10144928
## Callithrix_geoffroyi -0.04774194 -0.007741935 0.03225806
## Callithrix_jacchus -0.05560976 -0.015609756 0.02439024
## Cebuella_pygmaea -0.03121951 0.008780488 0.04878049
## Saguinus_midas Saguinus_oedipus Saimiri_sciureus
## Alouatta_caraya 0.03125000 -0.04337687 -0.02047414
## Ateles_geoffroyi 0.18604651 0.11141965 0.13432237
## Callimico_goeldii 0.10144928 0.02682241 0.04972514
## Callithrix_geoffroyi 0.03225806 -0.04236880 -0.01946607
## Callithrix_jacchus 0.02439024 -0.05023662 -0.02733389
## Cebuella_pygmaea 0.04878049 -0.02584638 -0.00294365
## Sapajus_apella Theropithecus_gelada Trachypithecus_auratus
## Alouatta_caraya 0.03125000 0.012019231 -0.005787037
## Ateles_geoffroyi 0.18604651 0.166815742 0.149009475
## Callimico_goeldii 0.10144928 0.082218506 0.064412238
## Callithrix_geoffroyi 0.03225806 0.013027295 -0.004778973
## Callithrix_jacchus 0.02439024 0.005159475 -0.012646793
## Cebuella_pygmaea 0.04878049 0.029549719 0.011743451
## Trachypithecus_cristatus Trachypithecus_francoisi
## Alouatta_caraya 0.03125000 -0.068750000
## Ateles_geoffroyi 0.18604651 0.086046512
## Callimico_goeldii 0.10144928 0.001449275
## Callithrix_geoffroyi 0.03225806 -0.067741935
## Callithrix_jacchus 0.02439024 -0.075609756
## Cebuella_pygmaea 0.04878049 -0.051219512
## Varecia_rubra Varecia_variegata
## Alouatta_caraya -0.10434322 -0.13980263
## Ateles_geoffroyi 0.05045329 0.01499388
## Callimico_goeldii -0.03414394 -0.06960336
## Callithrix_geoffroyi -0.10333516 -0.13879457
## Callithrix_jacchus -0.11120298 -0.14666239
## Cebuella_pygmaea -0.08681273 -0.12227214
This section creates heatmaps to visualize pairwise differences in prevalence. Species labels are colored based on prevalence quantiles.
# Simplified function to create colored labels based on trait quantiles
create_colored_labels <- function(trait_data) {
# Calculate quantile thresholds
q75 <- quantile(trait_data$trait, 0.75, na.rm = TRUE)
q25 <- quantile(trait_data$trait, 0.25, na.rm = TRUE)
# Create a lookup table with species and their colors
label_df <- trait_data %>%
mutate(
color = case_when(
trait >= q75 ~ "#D73027", # Red for top quartile
trait <= q25 ~ "#1A9850", # Green for bottom quartile
TRUE ~ "#000000" # Black for middle
)
) %>%
select(species, color)
# Create named vector of colored HTML labels
colored_labels <- setNames(
paste0("<span style='color:", label_df$color, ";'>", label_df$species, "</span>"),
label_df$species
)
return(colored_labels)
}
# Helper function to create legend plot
create_legend_plot <- function() {
legend_df <- tibble(
category = c("Top", "Bottom"),
color = c("#D73027", "#1A9850")
)
ggplot(legend_df, aes(x = 1, y = category, color = category)) +
geom_point(size = 0) +
scale_color_manual(
values = setNames(legend_df$color, legend_df$category),
guide = guide_legend(override.aes = list(size = 5))
) +
labs(color = "Trait Categories") +
theme_void() +
theme(
legend.position = "right",
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)
}
# Helper function to create heatmap
create_heatmap <- function(pairwise_data, colored_labels,
highlight_nonoverlap = TRUE) {
# Ensure label mapping is stable across axes
if (is.null(names(colored_labels))) {
stop("colored_labels must be a named vector with species names.")
}
pairwise_data <- pairwise_data %>%
mutate(
species1 = factor(species1, levels = names(colored_labels)),
species2 = factor(species2, levels = names(colored_labels))
)
p <- ggplot(pairwise_data, aes(x = species2, y = species1, fill = abs(diff_value))) +
geom_tile(color = "black", size = 0.25) +
geom_text(aes(label = sprintf("%.2f", abs(diff_value))), size = 3) +
scale_fill_gradient(low = "white", high = "salmon3", name = "Difference") +
scale_x_discrete(labels = function(x) colored_labels[x]) +
scale_y_discrete(labels = function(y) colored_labels[y]) +
theme_minimal() +
theme(
axis.text.x = element_markdown(angle = 45, hjust = 1),
axis.text.y = element_markdown(angle = 0),
panel.grid = element_blank(),
legend.position = "none"
)
# Add bold border for non-overlapping CIs if requested
if (highlight_nonoverlap) {
p <- p +
geom_tile(data = filter(pairwise_data, !overlap_use),
fill = NA, color = "black", size = 1.5) +
labs(
x = "Species", y = "Species",
title = paste("Pairwise Differences in trait: ", capitalized_trait, " with Non-overlapping CIs", sep = ""),
caption = "Bold border: non-overlapping CIs."
)
} else {
p <- p +
labs(
x = "Species", y = "Species",
title = paste("Pairwise Differences in trait: ", capitalized_trait, sep = ""),
caption = "Bold border: non-overlapping CIs."
)
}
return(p)
}
# Create colored labels directly from trait data
my_labels_colored <- create_colored_labels(trait_ci)
# Save colored labels
write.csv(my_labels_colored, file.path(ci_dir, "my_labels_colored.csv"),
quote = FALSE, row.names = TRUE)
# Create legend
legend_plot <- create_legend_plot()
# Create heatmaps with non-overlapping CI highlights
p_overlap <- create_heatmap(pairwise_data, my_labels_colored, highlight_nonoverlap = TRUE) +
legend_plot + plot_layout(ncol = 2, widths = c(1000, 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p_flat <- create_heatmap(pairwise_data, my_labels_colored, highlight_nonoverlap = FALSE) +
legend_plot + plot_layout(ncol = 2, widths = c(1000, 1))
# Save heatmaps
ggsave(file.path(ci_dir, "visual_matrix_no_overlap.png"), p_overlap,
width = 15, height = 12, dpi = "retina")
ggsave(file.path(ci_dir, "visual_matrix_flat.png"), p_flat,
width = 15, height = 12, dpi = "retina")
# Display plots
p_overlap
p_flat